#---------------------------------------------------------------------------
# Author : Manasa Ramakrishna, mr325@le.ac.uk
# Date started : 1st June, 2017
# Last modified : 7th June, 2017
# Aim : To take a look at first SILAC labelled LOPIT data on Trizol
# Depends : On 'silacFunctions.R'. Make sure they are in the same directory
# Notes : Works on data from Rayner's first experiments
#---------------------------------------------------------------------------
# Invoking libraries
# source("https://bioconductor.org/biocLite.R")
# biocLite("Mus.musculus")
library(clusterProfiler)
## Loading required package: DOSE
##
## DOSE v3.0.10 For help: https://guangchuangyu.github.io/DOSE
##
## If you use DOSE in published research, please cite:
## Guangchuang Yu, Li-Gen Wang, Guang-Rong Yan, Qing-Yu He. DOSE: an R/Bioconductor package for Disease Ontology Semantic and Enrichment analysis. Bioinformatics 2015, 31(4):608-609
## clusterProfiler v3.2.14 For help: https://guangchuangyu.github.io/clusterProfiler
##
## If you use clusterProfiler in published research, please cite:
## Guangchuang Yu., Li-Gen Wang, Yanyan Han, Qing-Yu He. clusterProfiler: an R package for comparing biological themes among gene clusters. OMICS: A Journal of Integrative Biology. 2012, 16(5):284-287.
library(ggplot2)
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(limma)
library(org.Hs.eg.db)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following object is masked from 'package:limma':
##
## plotMA
## The following objects are masked from 'package:stats':
##
## IQR, mad, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, cbind, colnames,
## do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff,
## sort, table, tapply, union, unique, unsplit, which, which.max,
## which.min
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:gplots':
##
## space
## The following objects are masked from 'package:base':
##
## colMeans, colSums, expand.grid, rowMeans, rowSums
##
library(outliers)
library(RColorBrewer)
library(reshape2)
library(stringr)
library(VennDiagram)
## Loading required package: grid
## Loading required package: futile.logger
library(wesanderson)
#Setting working directories
wd = "/Users/manasa/Documents/Work/TTT/02_Proteomics/01_First-SILAC-LOPIT/"
setwd(wd)
getwd()
## [1] "/Users/manasa/Documents/Work/TTT/02_Proteomics/01_First-SILAC-LOPIT"
indir = paste(wd,"Input",sep="/")
outdir = paste(wd,paste(Sys.Date(),"Output",sep = "_"),sep = "/")
if (exists(outdir)){
print("Outdir exists")
}else{
dir.create(outdir)
}
## Warning in dir.create(outdir): '/Users/manasa/Documents/Work/TTT/
## 02_Proteomics/01_First-SILAC-LOPIT//2017-06-14_Output' already exists
# Sourcing function file
source("silacFunctions.R")
# -----------
# Read data
# -----------
# File of contaminants - proteins to exclude from analysis as are things like keratin, alcohol dehydrogenase etc....
contam = read.delim("Input/Common contaminant_all.csv",sep=",",header=T)
# Read in the data files that contain peptide level output from Proteome discoverer...
# Modify the headers to be all lower case as well as remove unwanted spaces, symbols etc...to keep it simple
# Columns of interest are "sequence", "modifications","master.protein.accessions","abundance.heavy","abundance.light","abundance.ratio","quan.info"
infiles = grep("Trizol",list.files("Input/",full.names = T),value=T)
prot.data = NULL
for (i in infiles){
in.dat = read.delim(i,sep="\t",comment.char="",as.is=T,header=F)
in.dat$sample = strsplit(i,"//")[[1]][2]
print(i)
prot.data = rbind(prot.data,in.dat)
}
## [1] "Input//Trizol_150mJ_rep1.txt"
## [1] "Input//Trizol_150mJ_rep2.txt"
## [1] "Input//Trizol_150mJ_rep3.txt"
## [1] "Input//Trizol_275mJ_rep1.txt"
## [1] "Input//Trizol_275mJ_rep2.txt"
## [1] "Input//Trizol_275mJ_rep3.txt"
## [1] "Input//Trizol_400mJ_rep1.txt"
## [1] "Input//Trizol_400mJ_rep2.txt"
## [1] "Input//Trizol_400mJ_rep3.txt"
colnames(prot.data) = prot.data[1,]
dim(prot.data)
## [1] 81595 29
# Remove header lines as they differ in one of the columns (fraction number I think)
remove.head = which(prot.data[,1]=="Checked")
prot.data = prot.data[-(remove.head),]
dim(prot.data)
## [1] 81586 29
# Change header names a little to make them neutral and remove space, special characters
colnames(prot.data) = tolower(colnames(prot.data))
colnames(prot.data) = gsub(" ",".",colnames(prot.data))
colnames(prot.data) = gsub("#","no",colnames(prot.data))
colnames(prot.data) = gsub("\\:\\.f2\\:","n",colnames(prot.data))
colnames(prot.data)[12] = "theoretical.mass"
colnames(prot.data)[13] = "light.sample"
colnames(prot.data)[14] = "heavy.sample"
colnames(prot.data)[15] = "abundance.ratio.heavy.to.light"
colnames(prot.data)[16] = "abundance.light"
colnames(prot.data)[17] = "abundance.heavy"
colnames(prot.data)[29] = "sample"
# Convert abundance values to numeric from character
prot.data$abundance.heavy = as.numeric(prot.data$abundance.heavy)
prot.data$abundance.light = as.numeric(prot.data$abundance.light)
prot.data$abundance.ratio.heavy.to.light = as.numeric(prot.data$abundance.ratio.heavy.to.light)
# Add rep, reagent and UV amount columns
prot.data$uv = sapply(strsplit(prot.data$sample,"_"),"[[",2)
prot.data$repl = gsub(".txt","",gsub("rep","",sapply(strsplit(prot.data$sample,"_"),"[[",3)))
prot.data$repl = paste(prot.data$uv,prot.data$rep,sep=".")
prot.data$reagent = sapply(strsplit(prot.data$sample,"_"),"[[",1)
head(prot.data)
## checked confidence sequence modifications
## 2 FALSE High AGAHLQGGAK
## 3 FALSE High IMNTFSVVPSPK 1xLabel:13C(6)15N(2) [K12]
## 4 FALSE High IMNTFSVVPSPK
## 5 FALSE High NQVTQLKEQVPGFTPR
## 6 FALSE High EQELQQTLQQEQSVLDQLR
## 7 FALSE High TTPSVVAFTADGER
## qvality.pep qvality.q-value no.protein.groups no.proteins no.psms
## 2 5.61944E-06 0 1 2 16
## 3 3.4751E-06 0 4 11 8
## 4 2.42801E-06 0 4 11 6
## 5 0.000142696 0 1 2 4
## 6 1.97349E-07 0 1 1 6
## 7 9.80731E-06 0 1 3 4
## master.protein.accessions no.missed.cleavages theoretical.mass
## 2 P04406 0 909.4900898
## 3 Q13509; P04350; P07437; P68371 0 1327.716983
## 4 Q13509; P04350; P07437; P68371 0 1319.702784
## 5 F5H2F4 1 1841.986822
## 6 Q15149 0 2313.168093
## 7 P38646 0 1450.717248
## light.sample heavy.sample abundance.ratio.heavy.to.light abundance.light
## 2 High Peak Found 0.01 5200000
## 3 Not Found Not Found NA NA
## 4 Not Found Not Found NA NA
## 5 Not Found Not Found NA NA
## 6 High Not Found 0.01 656400
## 7 High Not Found NA NA
## abundance.heavy quan.info amanda.score.ms.amanda
## 2 43050 Unique 135.209857
## 3 NA No Quan Values 201.8662137
## 4 NA No Quan Values 176.2597659
## 5 NA No Quan Values 127.0712654
## 6 NA Unique 152.1398947
## 7 NA No Quan Values 104.2831076
## confidence.ms.amanda search.space.ms.amanda percolator.q-value.ms.amanda
## 2 High 1636 0
## 3 High 2601 0
## 4 High 2728 0
## 5 High 3048 0
## 6 High 3758 0
## 7 High 2414 0
## percolator.pep.ms.amanda ions.score.mascot confidence.mascot
## 2 0.00001018 65.04 High
## 3 0.000002614 75.28 High
## 4 0.00001715 79.87 High
## 5 0.00005757 40.09 High
## 6 1.928E-07 44.95 High
## 7 0.00002452 48.27 High
## search.space.mascot percolator.q-value.mascot percolator.pep.mascot
## 2 0 2.945E-07
## 3 0 1.695E-07
## 4 0 1.112E-07
## 5 0 0.00003356
## 6 0 1.211E-08
## 7 0 5.664E-07
## sample uv repl reagent
## 2 Trizol_150mJ_rep1.txt 150mJ 150mJ.1 Trizol
## 3 Trizol_150mJ_rep1.txt 150mJ 150mJ.1 Trizol
## 4 Trizol_150mJ_rep1.txt 150mJ 150mJ.1 Trizol
## 5 Trizol_150mJ_rep1.txt 150mJ 150mJ.1 Trizol
## 6 Trizol_150mJ_rep1.txt 150mJ 150mJ.1 Trizol
## 7 Trizol_150mJ_rep1.txt 150mJ 150mJ.1 Trizol
dim(prot.data)
## [1] 81586 32
prot.data has 32 columns and 81,586 rows - each row belonging to a peptide. We now go through a series of filtering steps to obtain a d
# ------------------
# Step 1 : Filter
# ------------------
# Step 1a : Filter only for those peptides that have a unique master protein. Done using column "quan.info" and titled 'Unique'
dim(prot.data)
## [1] 81586 32
peptide.stats = table(prot.data$sample,prot.data$quan.info)
peptide.stats
##
## No Quan Values Not Unique Unique
## Trizol_150mJ_rep1.txt 5211 287 4088
## Trizol_150mJ_rep2.txt 3869 248 3048
## Trizol_150mJ_rep3.txt 4571 247 3483
## Trizol_275mJ_rep1.txt 5266 289 3845
## Trizol_275mJ_rep2.txt 4303 212 3091
## Trizol_275mJ_rep3.txt 6578 468 6383
## Trizol_400mJ_rep1.txt 5365 286 4129
## Trizol_400mJ_rep2.txt 5203 265 3422
## Trizol_400mJ_rep3.txt 4320 219 2890
filt.1a = prot.data[which(prot.data$quan.info == "Unique"),]
length(which(filt.1a$quan.info == "Unique"))
## [1] 34379
dim(filt.1a) #34279 are unique proteins, 47207 are non-unique or are missing values
## [1] 34379 32
# This table is very odd. Rayner had an explanation but I'm still muddled. He said "High" was euivalent to "Peak found"
# However, there are peptides where it is "High" but the peptide values are NA. Hmmm....
table(light=filt.1a$light.sample,heavy=filt.1a$heavy.sample)
## heavy
## light High Not Found Peak Found
## High 0 15810 11775
## Not Found 1672 0 0
## Peak Found 5122 0 0
# Step 1b : Filter out those proteins that are contaminants from the contaminants list and annotate missing values
filt.1b = filt.1a[-which(filt.1a$master.protein.accessions %in% contam$Protein.Group.Accessions),]
num.contams = length(which(filt.1a$master.protein.accessions %in% contam$Protein.Group.Accessions))
# Annotate which peptides are missing heavy, light or both, abundance values
filt.1b$missing.val = rowSums(is.na(filt.1b[,c("abundance.heavy", "abundance.light")])) > 0
dim(filt.1a) # 34379 in total
## [1] 34379 32
dim(filt.1b) # 33657 filtered proteins
## [1] 33657 33
print(num.contams) # 722 contaminant proteins
## [1] 722
# Want to do some stats with missing values.
table(filt.1b$sample,filt.1b$missing.val) # More missing values in 150mJ_rep, 275mK_rep2 and 450mJ_rep3. However ration of missing/non-missing is same
##
## FALSE TRUE
## Trizol_150mJ_rep1.txt 1914 2097
## Trizol_150mJ_rep2.txt 1460 1523
## Trizol_150mJ_rep3.txt 1667 1723
## Trizol_275mJ_rep1.txt 1836 1915
## Trizol_275mJ_rep2.txt 1358 1670
## Trizol_275mJ_rep3.txt 3055 3175
## Trizol_400mJ_rep1.txt 2024 2031
## Trizol_400mJ_rep2.txt 1572 1797
## Trizol_400mJ_rep3.txt 1428 1412
round(table(filt.1b$sample,filt.1b$missing.val)/rowSums(table(filt.1b$sample,filt.1b$missing.val))*100,2)
##
## FALSE TRUE
## Trizol_150mJ_rep1.txt 47.72 52.28
## Trizol_150mJ_rep2.txt 48.94 51.06
## Trizol_150mJ_rep3.txt 49.17 50.83
## Trizol_275mJ_rep1.txt 48.95 51.05
## Trizol_275mJ_rep2.txt 44.85 55.15
## Trizol_275mJ_rep3.txt 49.04 50.96
## Trizol_400mJ_rep1.txt 49.91 50.09
## Trizol_400mJ_rep2.txt 46.66 53.34
## Trizol_400mJ_rep3.txt 50.28 49.72
# How many missing in heavy, how many missing in light
miss.l = table("_Missing light values_"=filt.1b$missing.val,filt.1b$light.sample)
miss.h = table("_Missing heavy values_"=filt.1b$missing.val,filt.1b$heavy.sample)
miss = cbind(miss.l,miss.h)
colnames(miss) = c("Light_High","Light_NotFound","Light_Found","Heavy_High","Heavy_NotFound","Heavy_Found")
rownames(miss) = c("notMissing","missing")
print(miss)
## Light_High Light_NotFound Light_Found Heavy_High Heavy_NotFound
## notMissing 11258 0 5056 5056 0
## missing 15644 1658 41 1699 15215
## Heavy_Found
## notMissing 11258
## missing 429
# Plot density plots
melt.1b = melt(filt.1b,id.vars = "repl", measure.vars = c("abundance.light", "abundance.heavy"))
ggplot(melt.1b,aes(x = log2(value+0.1))) + geom_density(aes(col = as.factor(variable)))+facet_wrap(~repl,ncol=3)+scale_colour_brewer(palette = "Dark2")
## Warning: Removed 17343 rows containing non-finite values (stat_density).
ggplot(filt.1b,aes(x = log2(abundance.ratio.heavy.to.light+0.1))) + geom_density(aes(col = as.factor(repl)))+facet_wrap(~repl,ncol=3)
ggplot(melt.1b,aes(x=repl,y = log2(value+0.1))) + geom_violin(aes(col = as.factor(variable)),draw_quantiles = c(0.25, 0.5, 0.75))
## Warning: Removed 17343 rows containing non-finite values (stat_ydensity).
We have a column called “missing.val” to identify which peptides have either a heavy or light abundance value missing. TRUE means it is missing one or both. FALSE means both values are present. A lot more “missing” values in the “heavy/non-crosslinked samples than”light/cross-linked samples“.
The above plots - density plots and violin plots include both missing and non-missing values. Hence, in the heavy/light density plots, you see a huge overlap in the curve with a tiny portion of the “light” curve going beyond the heavy curve. This is where we expect the interesting RNA binding proteins to lie. The next step however is to filter out the missing values.
# Step 1c : Filter out those peptides which are missing either "high" or "low" abundance values. We will look at this separately
# as we do not know for sure whether these are a result of extremely low signal due to enrichment or extremely low signal due to technical effects.
# There are 16314 peptides where we have quantification in both light and heavy abundance columns
filt.1c = filt.1b[which(rowSums(is.na(filt.1b[,c("abundance.heavy", "abundance.light")])) == 0),]
dim(filt.1c)
## [1] 16314 33
# Plot density plots
melt.1c = melt(filt.1c,id.vars = "repl", measure.vars = c("abundance.light", "abundance.heavy"))
ggplot(melt.1c,aes(x = log2(value+0.1))) + geom_density(aes(col = as.factor(variable)))+facet_wrap(~repl,ncol=3)+scale_colour_brewer(palette = "Dark2")
ggplot(filt.1c,aes(x = log2(abundance.ratio.heavy.to.light+0.1))) + geom_density(aes(col = as.factor(repl)))+facet_wrap(~repl,ncol=3)
ggplot(melt.1c,aes(x=repl,y = log2(value+0.1))) + geom_violin(aes(col = as.factor(variable)),draw_quantiles = c(0.5))
Once we remove peptides where the “heavy” or “light” value is missing, then there is a clear shift in the curve of intensity values for the “light” labelled sample which is our cross-linked sample and we hope it contains true RNA binding proteins. The median abundance for light samples is visibily higher than in heavy samples.
Note : It is important to remember that in a true experimental setting, we will not have SILAC labelling so we won’t have “heavy” and “light” values per peptide - all we will have is one abundance value. Rayner forced the mass spec to run as if it didn’t know about the SILAC labelling to parially emulate later experiments. However, we won’t be using the singleton data (heacvy only or light only) for the purposes of this initial analysis.
# ------------------
# Step 2 : Log-transform
# heavy = non-crosslinked
# light = crosslinked
# ------------------
# Log convert abundance values
filt.1c$heavy.log = log(filt.1c$abundance.heavy,2)
filt.1c$light.log = log(filt.1c$abundance.light,2)
# Generate an abundance ratio which for log transformed data is a subtraction
filt.1c$norm.abundance.ratio = filt.1c$light.log - filt.1c$heavy.log
# Data is checked
norm.data = filt.1c
dim(norm.data)
## [1] 16314 36
# Checking the counts of peaks with heavy and light values
table(light=norm.data$light.sample,heavy=norm.data$heavy.sample)
## heavy
## light High Peak Found
## High 0 11258
## Peak Found 5056 0
Once we have filtered the data to remove non-unique peptides and contaminants, we log transform (“normalise”) the data for heavy and light abundances. In addition, we subtract the logged abundances light-heavy to yield logged abundance ratios.
When we re-draw the table of heavy and light sample counts, we don’t have any “Not Found” values anymore. This was part of the exercise.
We have 16314 peptides that are present in both fractions. Analysing the difference in ratios between these two fractions are most likely to inform on whether or not they are enriched for RNA binding proteins.
# -----------------------------------------------
# Step 3 : Aggregating multiple peptides into one peptide
# heavy = non-crosslinked
# light = crosslinked
# -----------------------------------------------
# Subset the data to include columns with useful metadata and abundance ratios
# Transform the dataframe using 'melt' so the values for heavy and light are in one column. Can use this to draw plots comparing heavy to light values when necessary.
subset.cols = norm.data[,c("master.protein.accessions","sequence","modifications","repl","uv","missing.val","light.log","heavy.log","norm.abundance.ratio")]
dim(subset.cols)
## [1] 16314 9
# Tried various methods of aggregation
# Using sequence and repeat columns to aggregate
# Using all columns but the abundance ratio columns to aggregate
# Using mean, median or max to aggregate
# Using ddply as an alternative to aggregate
# Note : Finally, settled on taking the mean of the logged values and using 'aggregate' function
# We have 15356 unique peptide groups across all samples
agg.mean = aggregate(cbind(light.log,heavy.log,norm.abundance.ratio)~sequence+repl,data=subset.cols,FUN="mean")
agg.pep.table = table(agg.mean$sequence,agg.mean$repl)
table(agg.mean$repl) # 275mJ, replicate 3 has a lot more peptide groups than other samples
##
## 150mJ.1 150mJ.2 150mJ.3 275mJ.1 275mJ.2 275mJ.3 400mJ.1 400mJ.2 400mJ.3
## 1802 1366 1590 1728 1275 2882 1894 1466 1353
# Now that peptides have been aggregated into peptide groups, re-calculate the missing value table...
write.table(agg.pep.table, paste(outdir,"Aggregated-pepides-no-missing-values.txt",sep="/"),sep="\t",quote=F)
# Will go with agg.mean for further analysis
agg = agg.mean
# I want to add protein annotations back to the aggregated data. Just want to make sure that one peptide doesn't map to multiple proteins. It shouldn't as we have only selected unique ones. Checked and is true.
for(i in 1:nrow(agg)){
agg$num.prot[i] = length(unique(subset.cols$master.protein.accessions[which(subset.cols$sequence == agg$sequence[i] & subset.cols$repl == agg$repl[i])]))
agg$accessions[i] = paste(unique(subset.cols$master.protein.accessions[which(subset.cols$sequence == agg$sequence[i] & subset.cols$repl == agg$repl[i])]),collapse=",")
}
head(agg)
## sequence repl light.log heavy.log
## 1 AAAAAAALQAK 150mJ.1 27.91815 24.91056
## 2 AAAETQSLR 150mJ.1 26.22839 22.15931
## 3 AAAMANNLQK 150mJ.1 24.43873 20.25003
## 4 AAEAAPPTQEAQGETEPTEQAPDALEQAADTSR 150mJ.1 19.35638 19.07300
## 5 AAGPISER 150mJ.1 22.39100 22.81245
## 6 AAGPSLSHTSGGTQSK 150mJ.1 20.91634 17.96804
## norm.abundance.ratio num.prot accessions
## 1 3.0075883 1 P36578
## 2 4.0690834 1 Q13523
## 3 4.1886989 1 Q14498
## 4 0.2833818 1 Q8N163
## 5 -0.4214498 1 Q15427
## 6 2.9483034 1 P27694
table(agg.mean$repl)
##
## 150mJ.1 150mJ.2 150mJ.3 275mJ.1 275mJ.2 275mJ.3 400mJ.1 400mJ.2 400mJ.3
## 1802 1366 1590 1728 1275 2882 1894 1466 1353
# Temporarily recast data into a matrix to calculate correlations
for (t in c("light.log","heavy.log","norm.abundance.ratio")){
m = melt(agg,id.vars = c("sequence","repl"), measure.vars = t)
m.cast = dcast(m, sequence~repl+variable, fun.aggregate = mean)
cor.m = cor(m.cast[,2:ncol(m.cast)],use="pairwise.complete.obs")
colnames(cor.m) = gsub(paste("_",t,sep=""),"",colnames(cor.m))
rownames(cor.m) = gsub(paste("_",t,sep=""),"",rownames(cor.m))
heatmap.2(cor.m,trace = "none", dendrogram="none",col="redgreen", main=t)
}
Looking at the correlations for ‘heavy’ and ‘light’ abundance values across all replicates, it looks like the correlation is more within experimental replicates i.e high for rep1 of 150mJ, 275mJ, 400mJ than between 150mJ.rep1 and 150mJ.rep2 and so on.
# -----------------------------------------------
# Step 4 : Aggregating multiple peptides into one protein
# heavy = non-crosslinked
# light = crosslinked
# -----------------------------------------------
# We have 1262 unique proteins across all samples
agg.prot = aggregate(cbind(light.log,heavy.log,norm.abundance.ratio)~accessions+repl,data=agg,FUN="mean")
dim(agg.prot)
## [1] 5749 5
# Table of proteins vs samples - contingency to say which protein is present in which sample.
# Will help make overlaps
agg.prot.table = table(agg.prot$accessions,agg.prot$repl)
write.table(agg.prot.table, paste(outdir,"Aggregated-proteins-no-missing-values.txt",sep="/"),sep="\t",quote=F)
table(agg.prot$repl)
##
## 150mJ.1 150mJ.2 150mJ.3 275mJ.1 275mJ.2 275mJ.3 400mJ.1 400mJ.2 400mJ.3
## 676 544 600 648 511 943 696 594 537
There seem to be on average, ~640 proteins per sample in this experiment. 275mJ, rep3 has an unusually high number at 943. The samples at 150mJ of UV exposure have ~605 proteins, 275mJ have on average 716 proteins and 400mJ have on average 610 proteins.
# Now that we have the proteins, what are they doing functionally
# First let us look at the intersects within and between replicates
prot.matrix = as.data.frame.matrix(agg.prot.table)
print(dim(prot.matrix))
## [1] 1262 9
# Contains counts of overlap across 9 samples in various combinations
# Most intersections not very useful except that it tell us how many proteins overlap across all 9 samples = 211
prot.venn = venn(prot.matrix,show.plot=F)
isect = attr(prot.venn,"intersections")
# table of intersections
isect.count = t(as.data.frame(lapply(isect,length)))
colnames(isect.count) = "Count"
write.table(isect.count, paste(outdir,"Count-of-protein-overlaps-across-various-samples.txt",sep="/"),sep="\t",quote=F)
# Looking at overlaps within each uv dose - the more useful intersection exercise
add.int = NULL
# Looping through each uv dosage triplicate - 1:3, 4:6, 7:9
# add.int contains all intersections for each triplicate
for(k in c(1,4,7)){
print(k)
prot.venn.tmp = venn(prot.matrix[,k:(k+2)],show.plot=F)
vennDiagram(prot.matrix[,k:(k+2)],circle.col=c("turquoise", "salmon","palegreen"))
add.int = c(add.int,attr(prot.venn.tmp,"intersections"))
}
## [1] 1
## [1] 4
## [1] 7
write.table(t(as.data.frame(lapply(add.int,length))), paste(outdir,"Count-of-protein-overlaps-within-repl-for-each-dose.txt",sep="/"),sep="\t",quote=F)
The venn diagrams show the overlap of proteins within each uv dosage across replicates. There are between 350 and 370 overlapping proteins within each UV dosage. Across all 9 replicates, there are 211 proteins that we can extract as shown below. The next step is to map these proteins to some functional annotations. We will map each interaction group separately to see what it yields. Will use ‘clusterProfiler’ to do this.
isect[280] $
150mJ.1:150mJ.2:150mJ.3:275mJ.1:275mJ.2:275mJ.3:400mJ.1:400mJ.2:400mJ.3[1] “A0A024R4E5” “A0A087WUT6” “A0A087WVQ6” “A0A087X0X3” “A0A0A0MRV0” “A0A0C4DG17” “A0A0C4DG49” “A0A0D9SF53” “A0A0G2JNW7” “A0A0U1RRM4” “A8MXP9” “E7EVA0”
[13] “F5H2F4” “F8W930” “G8JLB6” “H3BLZ8” “J3KPP4” “J3KTA4” “M0QYS1” “O00203” “O00567” “O15479” “O43175” “O43390”
[25] “O43493” “O60506” “O75369” “O75400” “O75533” “O75534” “O76021” “O95218” “P02545” “P02786” “P04406” “P04792”
[37] “P05023” “P05556” “P06748” “P06756” “P07195” “P07237” “P07355” “P07737” “P07814” “P07900” “P08238” “P08621”
[49] “P08670” “P09429” “P09619” “P09651” “P0C7U0” “P10809” “P10909” “P11047” “P11142” “P11387” “P11940” “P13639”
[61] “P13667” “P14618” “P14625” “P14866” “P15880” “P16070” “P16989” “P18124” “P18583” “P18827” “P19338” “P20700”
[73] “P21399” “P22314” “P22626” “P23246” “P23396” “P23526” “P26006” “P26373” “P26641” “P30101” “P31942” “P31948”
[85] “P32004” “P35052” “P35579” “P35613” “P35659” “P36578” “P37802” “P38646” “P39023” “P42704” “P42892” “P43121”
[97] “P46777” “P46781” “P46976” “P47914” “P48634” “P49327” “P49411” “P49588” “P49756” “P50454” “P50895” “P50914”
[109] “P50990” “P51991” “P52597” “P53396” “P55011” “P55072” “P55290” “P55795” “P60709” “P61247” “P61978” “P62241”
[121] “P62249” “P62263” “P62269” “P62424” “P62753” “P62913” “P62995” “P67809” “P78527” “Q00839” “Q01105” “Q01130”
[133] “Q01844” “Q02878” “Q05519” “Q07065” “Q07666” “Q07954” “Q08170” “Q08211” “Q08945” “Q09666” “Q12849” “Q12906”
[145] “Q13148” “Q13151” “Q13243” “Q13263” “Q13283” “Q13641” “Q13740” “Q14103” “Q14108” “Q14152” “Q14315” “Q14444”
[157] “Q14498” “Q14690” “Q15061” “Q15084” “Q15149” “Q15233” “Q15287” “Q15717” “Q15758” “Q15904” “Q16629” “Q16658”
[169] “Q5BKZ1” “Q5T6F2” “Q6PD62” “Q6UVK1” “Q7KZF4” “Q7L2E3” “Q7L4I2” “Q7Z3B1” “Q86SJ2” “Q8N7H5” “Q8NC51” “Q8NE71”
[181] “Q8WVC0” “Q92541” “Q92879” “Q92945” “Q96AE4” “Q96I24” “Q96KR1” “Q96PK6” “Q96T37” “Q99700” “Q99714” “Q9BRL6”
[193] “Q9BUQ8” “Q9H307” “Q9NR30” “Q9NUM4” “Q9NWH9” “Q9NZB2” “Q9P121” “Q9UKM9” “Q9UQ35” “Q9UQ80” “Q9Y2W1” “Q9Y2X3”
[205] “Q9Y383” “Q9Y3Y2” “Q9Y490” “Q9Y4C8” “Q9Y4L1” “Q9Y520” “X5DQS5”
# KEGG enrichment for the intersections
# Let's see if we can work out whether the proteins are going up or down in light:heavy or crosslink:non-crosslink
# Main interactions of interest are (1) across all 9 samples (n = 211) (2) Overlap within each triplicate - 150mJ = 369, 275mJ = 358
prot.univ = bitr(unique(filt.1a$master.protein.accessions), fromType="UNIPROT", toType=c("ENTREZID"), OrgDb="org.Hs.eg.db") # 1157/1262 genes have Entrez Ids
## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(unique(filt.1a$master.protein.accessions), fromType =
## "UNIPROT", : 10.7% of input gene IDs are fail to map...
across.9.kegg = enrichK(isect,280,agg.prot,0.05,outdir)
## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(down.in.crosslink, fromType = "UNIPROT", toType =
## c("ENTREZID"), : 9.09% of input gene IDs are fail to map...
## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(up.in.crosslink, fromType = "UNIPROT", toType =
## c("ENTREZID"), : 5.21% of input gene IDs are fail to map...
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
across.150mJ = enrichK(add.int,"150mJ.1:150mJ.2:150mJ.3",agg.prot,0.05,outdir)
## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(down.in.crosslink, fromType = "UNIPROT", toType =
## c("ENTREZID"), : 12.5% of input gene IDs are fail to map...
## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(up.in.crosslink, fromType = "UNIPROT", toType =
## c("ENTREZID"), : 6.23% of input gene IDs are fail to map...
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
across.275mJ = enrichK(add.int,"275mJ.1:275mJ.2:275mJ.3",agg.prot,0.05,outdir)
## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(down.in.crosslink, fromType = "UNIPROT", toType =
## c("ENTREZID"), : 13.64% of input gene IDs are fail to map...
## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(up.in.crosslink, fromType = "UNIPROT", toType =
## c("ENTREZID"), : 5.59% of input gene IDs are fail to map...
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
across.400mJ = enrichK(add.int,"400mJ.1:400mJ.2:400mJ.3",agg.prot,0.05,outdir)
## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(down.in.crosslink, fromType = "UNIPROT", toType =
## c("ENTREZID"), : 8.7% of input gene IDs are fail to map...
## 'select()' returned 1:many mapping between keys and columns
## Warning in bitr(up.in.crosslink, fromType = "UNIPROT", toType =
## c("ENTREZID"), : 6.9% of input gene IDs are fail to map...
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
## 'select()' returned 1:many mapping between keys and columns
## 'select()' returned 1:1 mapping between keys and columns
all.kegg = rbind(across.9.kegg,across.150mJ,across.275mJ,across.400mJ)
write.table(all.kegg, paste(outdir,"KEGG-enrichment-for-enriched-proteins.txt",sep="/"),sep="\t",quote=F)
Not sure what to define the protein “universe” as. Used all of the proteins in the aggregaed list but this is not sufficient to run the KEGG analysis (throws a “not suffiecient members in group” error. Need to read a bit more about the inner workings of enrichKEGG to see if this can be changed.
Meanwhile, the overlapping proteins across all samples are enriched for the terms “Ribosome”,“Spliceosome”,“Protein processing in ER”,“Cell adhesion molecules” etc…Rayner concerned about presence of proteoglycans as these could be unwanted members entering the interface. Experiments are underway to check this.
I have also done an enrichment for proteins that were common within triplicate and each UV dosage. Get very similar terms as before (which is expected) and a few extra. The 150mJ dosage has the most number of significant KEGG mappings of the three dosages. There are a few pathways that aren’t enriched in the crosslinked sample (Down) but majority are. There are instances where the term “splisosome” appears in both enriched and un-enriched categories but the genes that contribute to this KEGG term are different in the enriched and unenriched cases. Might be worth pursuing these genes that in the unenriched category - they are heterogeneous nuclear riboneucleo protein and small nuclear ribonucleoprotein, RNA helicase and splicing factor subunit.
The 275mJ dosage has a high number of histones which map to pathways such as Systemic lupus erythematosus, Viral carcinogenesis, ECM-receptor interaction and Alcoholism which are a bit odd. If you remember, 275mJ has on average more proteins per sample than the other two time points. Perhaps this isn’t the ideal UV dosage for the study.